{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "e39c55b4-8c15-4e21-a7e0-480605ec1289",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T13:32:27.713973Z",
     "iopub.status.busy": "2022-06-21T13:32:27.713357Z",
     "iopub.status.idle": "2022-06-21T13:32:27.745049Z",
     "shell.execute_reply": "2022-06-21T13:32:27.744549Z",
     "shell.execute_reply.started": "2022-06-21T13:32:27.713922Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>236</th>\n",
       "      <th>237</th>\n",
       "      <th>238</th>\n",
       "      <th>239</th>\n",
       "      <th>240</th>\n",
       "      <th>241</th>\n",
       "      <th>242</th>\n",
       "      <th>243</th>\n",
       "      <th>244</th>\n",
       "      <th>245</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>b'834aafbd92fcd72d2a36b52ec53751a5'</td>\n",
       "      <td>b'7ba9426e097abc45f54ecb1e943c9e7d'</td>\n",
       "      <td>b'44914d916c57cfda8a2ce7fcf224cae0'</td>\n",
       "      <td>b'5f9ce39aec46f3e8e8aebbc722d8ceeb'</td>\n",
       "      <td>6376642</td>\n",
       "      <td>-0.125558</td>\n",
       "      <td>0.154679</td>\n",
       "      <td>0.160516</td>\n",
       "      <td>0.154543</td>\n",
       "      <td>...</td>\n",
       "      <td>0.433675</td>\n",
       "      <td>0.401416</td>\n",
       "      <td>-0.423497</td>\n",
       "      <td>0.439084</td>\n",
       "      <td>0.433675</td>\n",
       "      <td>0.401416</td>\n",
       "      <td>-0.423497</td>\n",
       "      <td>0.439084</td>\n",
       "      <td>0.433675</td>\n",
       "      <td>0.401416</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 246 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0                                    1    \\\n",
       "0    0  b'834aafbd92fcd72d2a36b52ec53751a5'   \n",
       "\n",
       "                                   2                                    3    \\\n",
       "0  b'7ba9426e097abc45f54ecb1e943c9e7d'  b'44914d916c57cfda8a2ce7fcf224cae0'   \n",
       "\n",
       "                                   4        5         6         7         8    \\\n",
       "0  b'5f9ce39aec46f3e8e8aebbc722d8ceeb'  6376642 -0.125558  0.154679  0.160516   \n",
       "\n",
       "        9    ...       236       237       238       239       240       241  \\\n",
       "0  0.154543  ...  0.433675  0.401416 -0.423497  0.439084  0.433675  0.401416   \n",
       "\n",
       "        242       243       244       245  \n",
       "0 -0.423497  0.439084  0.433675  0.401416  \n",
       "\n",
       "[1 rows x 246 columns]"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import codecs\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Peek at the first rows of the raw training file. It is read without a\n",
    "# header row, so pandas labels the columns with integers.\n",
    "df = pd.read_csv(\n",
    "    \"创意视角下的数字广告CTR预估挑战赛公开数据/train_data.txt\",\n",
    "    header=None,\n",
    "    nrows=10,\n",
    ")\n",
    "df.head(n=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "54e88458-a604-4618-be0f-b992158d7c6f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T13:53:03.764216Z",
     "iopub.status.busy": "2022-06-21T13:53:03.763656Z",
     "iopub.status.idle": "2022-06-21T13:53:44.517540Z",
     "shell.execute_reply": "2022-06-21T13:53:44.516470Z",
     "shell.execute_reply.started": "2022-06-21T13:53:03.764170Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Categorical side of the training set: only the six named columns are kept.\n",
    "# NOTE(review): this relies on pandas assigning `names` to the leading fields\n",
    "# of each row when `usecols` lists the same names - confirm on pandas upgrades.\n",
    "train_df = pd.read_csv(\n",
    "    \"创意视角下的数字广告CTR预估挑战赛公开数据/train_data.txt\",\n",
    "    header=None,\n",
    "    names=['label','pkgname','ver','slotid','mediaid','material'],\n",
    "    usecols=['label','pkgname','ver','slotid','mediaid','material'],\n",
    "    # Every selected column is parsed as a pandas categorical.\n",
    "    dtype=\"category\",\n",
    "    low_memory=True,\n",
    "    nrows=None,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "0c641f20-f609-4cc4-9c63-1e41b961c41e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T13:55:04.767033Z",
     "iopub.status.busy": "2022-06-21T13:55:04.766436Z",
     "iopub.status.idle": "2022-06-21T13:55:04.835286Z",
     "shell.execute_reply": "2022-06-21T13:55:04.834270Z",
     "shell.execute_reply.started": "2022-06-21T13:55:04.766983Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Drop rows whose first field is the literal 'adx_slot_id' (presumably header\n",
    "# lines embedded in the data file) and renumber the rows, so the index is\n",
    "# contiguous and matches row positions in the separately loaded numeric array.\n",
    "train_df = train_df[train_df['label'] != 'adx_slot_id'].reset_index(drop=True)\n",
    "# Filtering leaves the removed values behind as unused categories on every\n",
    "# categorical column; prune them so later casts/encodings only see real values.\n",
    "train_df = train_df.apply(lambda col: col.cat.remove_unused_categories())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "1a89c566-c1df-443e-863f-b3be775f329e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T13:55:50.868583Z",
     "iopub.status.busy": "2022-06-21T13:55:50.867990Z",
     "iopub.status.idle": "2022-06-21T14:01:02.588691Z",
     "shell.execute_reply": "2022-06-21T14:01:02.588149Z",
     "shell.execute_reply.started": "2022-06-21T13:55:50.868530Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Numeric side of the training set: fields 6..245 of every row, as float32.\n",
    "# 'adx_slot_id' is passed as the `comments` marker, so loadtxt ignores the rest\n",
    "# of a line from that token onward; lines that start with it are skipped.\n",
    "train_data = np.loadtxt(\n",
    "    \"创意视角下的数字广告CTR预估挑战赛公开数据/train_data.txt\",\n",
    "    delimiter=\",\",\n",
    "    comments=\"adx_slot_id\",\n",
    "    usecols=tuple(range(6, 246)),\n",
    "    dtype=np.float32,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "77ef953a-8744-4ae1-aa94-71820387a519",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T14:01:18.767244Z",
     "iopub.status.busy": "2022-06-21T14:01:18.766436Z",
     "iopub.status.idle": "2022-06-21T14:01:20.115492Z",
     "shell.execute_reply": "2022-06-21T14:01:20.114951Z",
     "shell.execute_reply.started": "2022-06-21T14:01:18.767170Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Persist both parsed views of the training file to the working directory.\n",
    "train_df.to_parquet('train_df.parquet')\n",
    "np.save(file='train_data.npy', arr=train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "298a9a6f-5f26-42b6-a214-406aa3778b7d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T14:03:53.766693Z",
     "iopub.status.busy": "2022-06-21T14:03:53.766092Z",
     "iopub.status.idle": "2022-06-21T14:05:40.688131Z",
     "shell.execute_reply": "2022-06-21T14:05:40.687578Z",
     "shell.execute_reply.started": "2022-06-21T14:03:53.766642Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Categorical side of the test set: the five named columns, each parsed as a\n",
    "# pandas categorical (same names as the training frame minus 'label').\n",
    "test_df = pd.read_csv(\n",
    "    \"创意视角下的数字广告CTR预估挑战赛公开数据/test_data.txt\",\n",
    "    low_memory=True,\n",
    "    header=None,\n",
    "    usecols=['pkgname','ver','slotid','mediaid','material'],\n",
    "    names=['pkgname','ver','slotid','mediaid','material'],\n",
    "    nrows=None,\n",
    "    dtype={\n",
    "        'pkgname': \"category\",\n",
    "        'ver': \"category\",\n",
    "        'slotid': \"category\",\n",
    "        'mediaid': \"category\",\n",
    "        'material': \"category\",\n",
    "    },\n",
    ")\n",
    "\n",
    "# Numeric side of the test set: fields 5..244 of every row, as float32.\n",
    "# loadtxt ignores everything from the 'adx_slot_id' marker onward on a line.\n",
    "test_data = np.loadtxt(\n",
    "    \"创意视角下的数字广告CTR预估挑战赛公开数据/test_data.txt\",\n",
    "    dtype=np.float32,\n",
    "    comments=\"adx_slot_id\",\n",
    "    delimiter=\",\",\n",
    "    usecols=list(range(5, 245)),\n",
    ")\n",
    "\n",
    "# The two readers treat 'adx_slot_id' lines differently (read_csv keeps them,\n",
    "# loadtxt drops them), so fail loudly if the two views stop lining up row for row.\n",
    "assert len(test_df) == len(test_data), (\n",
    "    f\"row mismatch: test_df has {len(test_df)} rows, test_data has {len(test_data)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "ebc15168-64d4-43e4-bec0-436e11dd5d1a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T14:05:45.031254Z",
     "iopub.status.busy": "2022-06-21T14:05:45.030670Z",
     "iopub.status.idle": "2022-06-21T14:05:45.469034Z",
     "shell.execute_reply": "2022-06-21T14:05:45.468424Z",
     "shell.execute_reply.started": "2022-06-21T14:05:45.031205Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Persist both parsed views of the test file to the working directory.\n",
    "test_df.to_parquet('test_df.parquet')\n",
    "np.save(file='test_data.npy', arr=test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "5e09527e-7d5d-4cb0-bb9e-b2290ac1b6d4",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T14:03:48.956950Z",
     "iopub.status.busy": "2022-06-21T14:03:48.956367Z",
     "iopub.status.idle": "2022-06-21T14:03:48.964004Z",
     "shell.execute_reply": "2022-06-21T14:03:48.962910Z",
     "shell.execute_reply.started": "2022-06-21T14:03:48.956900Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((2219537, 6), (2219537, 240))"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the shapes of the categorical frame and the numeric array side by side;\n",
    "# their row counts are expected to agree.\n",
    "(train_df.shape, train_data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "878f1171-97bf-4795-8cdf-9ff515406e9a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T14:05:53.200971Z",
     "iopub.status.busy": "2022-06-21T14:05:53.200373Z",
     "iopub.status.idle": "2022-06-21T14:05:53.206741Z",
     "shell.execute_reply": "2022-06-21T14:05:53.205958Z",
     "shell.execute_reply.started": "2022-06-21T14:05:53.200920Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((666351, 5), (666351, 240))"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the shapes of the categorical frame and the numeric array side by side;\n",
    "# their row counts are expected to agree.\n",
    "(test_df.shape, test_data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "14392007-d73f-4eb1-8e1d-36f5c83c4fd3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2022-06-21T14:06:07.769255Z",
     "iopub.status.busy": "2022-06-21T14:06:07.768673Z",
     "iopub.status.idle": "2022-06-21T14:06:07.967213Z",
     "shell.execute_reply": "2022-06-21T14:06:07.966743Z",
     "shell.execute_reply.started": "2022-06-21T14:06:07.769205Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1133, 4)"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the creative file line by line. The handle is opened in a `with` block\n",
    "# so it is closed as soon as the lines have been read.\n",
    "# NOTE(review): no encoding is passed, so the platform default is used - confirm\n",
    "# the file's encoding and pin it explicitly.\n",
    "with codecs.open('创意视角下的数字广告CTR预估挑战赛公开数据/creative.txt') as creative_file:\n",
    "    lines = creative_file.readlines()\n",
    "\n",
    "# One row per line, one column per tab-separated field.\n",
    "creative = pd.DataFrame([x.strip().split('\\t') for x in lines])\n",
    "creative.columns = ['material', 'title', 'desc', 'img']\n",
    "\n",
    "creative.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8310fc22-6eb7-4196-bc91-7acba3299840",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
